library(dplyr)
library(EIAapi)
library(jsonlite)
library(gt)
library(plotly)
library(modeltime)
source("../pipeline/eia_data.R")
source("../pipeline/backtesting.R")Data Backfill
The goal of this doc is to execute an initial data pull of the hourly demand for California balancing authority subregion (CISO). This includes the following four independent system operators:
- Pacific Gas and Electric (PGAE)
- Southern California Edison (SCE)
- San Diego Gas and Electric (SDGE)
- Valley Electric Association (VEA)
The data backfill process includes the following steps:
- Setting parameters and pulling the data
- Data quality checks
- Saving the data and creating a log file
Load Libraries and Functions
meta_json <- read_json(path = "../settings/settings.json")
s <- meta_json$series
series <- lapply(1:length(s), function(i) {
return(data.frame(
parent_id = s[[i]]$parent_id,
parent_name = s[[i]]$parent_name,
subba_id = s[[i]]$subba_id,
subba_name = s[[i]]$subba_name
))
}) |>
bind_rows()
api_path <- meta_json$api_path
meta_path <- meta_json$meta_path
data_path <- meta_json$data_path
forecast_path <- meta_json$forecast_path
forecast_log_path <- meta_json$forecast_log_path
calibrated_models_path <- meta_json$calibrated_models_path
h <- meta_json$backtesting$h
lags <- meta_json$backtesting$features$lags |> unlist()
train_length <- meta_json$train_length
offset <- meta_json$offset
tz <- meta_json$timezone
models_settings <- meta_json$backtesting$models
init <- TRUE
save <- TRUEfacets_template <- list(
parent = NULL,
subba = NULL
)
start <- as.POSIXct(paste(
paste(
meta_json$start$year,
meta_json$start$month,
meta_json$start$day,
sep = "-"
),
" ",
meta_json$start$hour,
":00:00",
sep = ""
))
end <- as.POSIXct(paste(
paste(
meta_json$end$year,
meta_json$end$month,
meta_json$end$day,
sep = "-"
),
" ",
meta_json$end$hour,
":00:00",
sep = ""
))
attr(start, "tzone") <- tz
attr(end, "tzone") <- tz
eia_api_key <- Sys.getenv("EIA_API_KEY")metadata <- eia_metadata(api_key = eia_api_key, api_path = api_path)Warning: input string 'The api_key argument is missing... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is not valid... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is missing... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is not valid, must be a character object [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not pull the metadata... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not parse the metadata JSON... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is missing... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_key argument is not valid... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is missing... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'The api_path argument is not valid, must be a character object [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not pull the metadata... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
Warning: input string 'Could not parse the metadata JSON... [0;92m❌[0m
' cannot be translated from 'ANSI_X3.4-1968' to UTF-8, but is valid UTF-8
print(names(metadata)) [1] "id" "name" "description"
[4] "frequency" "facets" "data"
[7] "startPeriod" "endPeriod" "defaultDateFormat"
[10] "defaultFrequency" "command"
print(metadata$startPeriod)[1] "2018-06-19T05"
print(metadata$endPeriod)[1] "2024-07-14T07"
meta <- NULL
data <- NULL
for (i in 1:nrow(series)) {
facets <- facets_template
facets["parent"] <- series[i, "parent_id"]
facets["subba"] <- series[i, "subba_id"]
print(facets)
temp <- eia_backfill(
start = start,
end = end,
offset = offset,
api_key = eia_api_key,
api_path = paste(api_path, "data", sep = ""),
facets = facets
)
index <- seq.POSIXt(from = start, to = end, by = "hour")
ts_obj <- data.frame(period = index, subba = series[i, "subba_id"]) |>
left_join(temp, by = c("period" = "time", "subba"))
# Impute missing values
nas <- which(is.na(ts_obj$value))
ts_obj$type <- ifelse(is.na(ts_obj$value), "impute", "actual")
for (l in nas) {
if (l > 48) {
ts_obj$value[l] <- (ts_obj$value[l - 24] + ts_obj$value[l - 48]) / 2
}
}
meta_temp <- create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
meta_temp$index <- 1
meta_df <- as.data.frame(meta_temp)
meta <- rbind(meta, meta_df)
data <- rbind(data, ts_obj)
}$parent
[1] "CISO"
$subba
[1] "PGAE"
$parent
[1] "CISO"
$subba
[1] "SCE"
$parent
[1] "CISO"
$subba
[1] "SDGE"
$parent
[1] "CISO"
$subba
[1] "VEA"
print(meta) index parent subba time start
1 1 CISO PGAE 2024-07-15 04:37:28 2018-07-01 08:00:00
2 1 CISO SCE 2024-07-15 04:37:54 2018-07-01 08:00:00
3 1 CISO SDGE 2024-07-15 04:38:22 2018-07-01 08:00:00
4 1 CISO VEA 2024-07-15 04:38:49 2018-07-01 08:00:00
end start_act end_act start_match
1 2024-07-08 01:00:00 2018-07-01 08:00:00 2024-07-08 01:00:00 TRUE
2 2024-07-08 01:00:00 2018-07-01 08:00:00 2024-07-08 01:00:00 TRUE
3 2024-07-08 01:00:00 2018-07-01 08:00:00 2024-07-08 01:00:00 TRUE
4 2024-07-08 01:00:00 2018-07-01 08:00:00 2024-07-08 01:00:00 TRUE
end_match n_obs na type update success comments
1 TRUE 52770 0 backfill FALSE FALSE
2 TRUE 52770 0 backfill FALSE FALSE
3 TRUE 52770 0 backfill FALSE FALSE
4 TRUE 52770 0 backfill FALSE FALSE
# The initial pull has some missing values
head(data) period subba subba_name parent
1 2018-07-01 08:00:00 PGAE Pacific Gas and Electric CISO
2 2018-07-01 09:00:00 PGAE Pacific Gas and Electric CISO
3 2018-07-01 10:00:00 PGAE Pacific Gas and Electric CISO
4 2018-07-01 11:00:00 PGAE Pacific Gas and Electric CISO
5 2018-07-01 12:00:00 PGAE Pacific Gas and Electric CISO
6 2018-07-01 13:00:00 PGAE Pacific Gas and Electric CISO
parent_name value value_units type
1 California Independent System Operator 12522 megawatthours actual
2 California Independent System Operator 11745 megawatthours actual
3 California Independent System Operator 11200 megawatthours actual
4 California Independent System Operator 10822 megawatthours actual
5 California Independent System Operator 10644 megawatthours actual
6 California Independent System Operator 10559 megawatthours actual
# Save the data
d <- append_data(data_path = data_path, new_data = data, init = TRUE, save = TRUE)[1] "Initial data pull"
[1] "Save the data to CSV file"
# Save the metadata
meta["success"] <- TRUE
meta["update"] <- TRUE
m <- append_metadata(meta_path = meta_path, new_meta = meta, init = TRUE, save = TRUE)[1] "Saving the metadata file"
Plot the Series
We will use Plotly to visualize the series:
end <- lubridate::floor_date((max(data$period)), unit = "day")
d <- data |>
arrange(subba, period) |>
dplyr::filter(period < end)
d$subba <- as.character(d$subba)
p <- plot_ly(d, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")
pCreate an Inital Forecast
head(d) period subba subba_name parent
1 2018-07-01 08:00:00 PGAE Pacific Gas and Electric CISO
2 2018-07-01 09:00:00 PGAE Pacific Gas and Electric CISO
3 2018-07-01 10:00:00 PGAE Pacific Gas and Electric CISO
4 2018-07-01 11:00:00 PGAE Pacific Gas and Electric CISO
5 2018-07-01 12:00:00 PGAE Pacific Gas and Electric CISO
6 2018-07-01 13:00:00 PGAE Pacific Gas and Electric CISO
parent_name value value_units type
1 California Independent System Operator 12522 megawatthours actual
2 California Independent System Operator 11745 megawatthours actual
3 California Independent System Operator 11200 megawatthours actual
4 California Independent System Operator 10822 megawatthours actual
5 California Independent System Operator 10644 megawatthours actual
6 California Independent System Operator 10559 megawatthours actual
models_mapping <- readRDS(file = calibrated_models_path) |>
dplyr::select(subba, method)
fc <- NULL
fc <- refresh_forecast(
input = d,
forecast_log_path = forecast_log_path,
forecast_path = forecast_path,
models_settings = models_settings,
model_mapping = model_mapping,
h = h,
index = "period",
var = "value",
train_length = 24 * 31 * 25,
lags = lags,
init = init,
save = save,
seasonal = TRUE,
trend = TRUE
)Checking if new data points are available to refresh the forecast
New data is avaiable, starting the forecast refresh process
PGAE: Create a forecast
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Converting to Modeltime Table.
SCE: Create a forecast
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Converting to Modeltime Table.
SDGE: Create a forecast
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Converting to Modeltime Table.
VEA: Create a forecast
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
Converting to Modeltime Table.
Initialize the forecast file
Save the forecast to ../data/forecast.csv
if (!is.null(fc)) {
p <- plot_forecast(
input = d,
forecast = fc,
hours = 24 * 4,
index = "period",
var = "value"
)
p
}